{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n=========================================================\nPipelining: chaining a PCA and a logistic regression\n=========================================================\n\nThe PCA does an unsupervised dimensionality reduction, while the logistic\nregression does the prediction.\n\nWe use a GridSearchCV to set the dimensionality of the PCA\n\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "print(__doc__)\n\n\n# Code source: Ga\u00ebl Varoquaux\n# Modified for documentation by Jaques Grobler\n# License: BSD 3 clause\n\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport pandas as pd\n\nfrom sklearn import datasets\nfrom sklearn.decomposition import PCA\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.model_selection import GridSearchCV\n\n\n# Define a pipeline to search for the best combination of PCA truncation\n# and classifier regularization.\npca = PCA()\n# set the tolerance to a large value to make the example faster\nlogistic = LogisticRegression(max_iter=10000, tol=0.1)\npipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])\n\nX_digits, y_digits = datasets.load_digits(return_X_y=True)\n\n# Parameters of pipelines can be set using \u2018__\u2019 separated parameter names:\nparam_grid = {\n    'pca__n_components': [5, 15, 30, 45, 64],\n    'logistic__C': np.logspace(-4, 4, 4),\n}\nsearch = GridSearchCV(pipe, param_grid, n_jobs=-1)\nsearch.fit(X_digits, y_digits)\nprint(\"Best parameter (CV score=%0.3f):\" % search.best_score_)\nprint(search.best_params_)\n\n# Plot the PCA spectrum\npca.fit(X_digits)\n\nfig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6))\nax0.plot(np.arange(1, pca.n_components_ + 1),\n         pca.explained_variance_ratio_, '+', linewidth=2)\nax0.set_ylabel('PCA explained variance ratio')\n\nax0.axvline(search.best_estimator_.named_steps['pca'].n_components,\n            linestyle=':', label='n_components chosen')\nax0.legend(prop=dict(size=12))\n\n# For each number of components, find the best classifier results\nresults = pd.DataFrame(search.cv_results_)\ncomponents_col = 'param_pca__n_components'\nbest_clfs = results.groupby(components_col).apply(\n    lambda g: g.nlargest(1, 'mean_test_score'))\n\nbest_clfs.plot(x=components_col, y='mean_test_score', yerr='std_test_score',\n               legend=False, ax=ax1)\nax1.set_ylabel('Classification accuracy (val)')\nax1.set_xlabel('n_components')\n\nplt.xlim(-1, 70)\n\nplt.tight_layout()\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.9"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}